First, we load all of the necessary libraries.
library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyr")
library("lubridate")
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library("geosphere")
## Loading required package: sp
library("ggmap")
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Next, we read in the dataset we will be using. We will use the citibike data from Jan 2016 - Sept 2016.
# Read the Citi Bike trip sample (Jan 2016 - Sept 2016) and preview the
# first few records.
citi <- read.csv(file = "sample_data.csv")
head(citi, n = 6)
## X tripduration starttime stoptime
## 1 4763695 905 6/13/2016 07:22:26 6/13/2016 07:37:31
## 2 2734462 266 4/23/2016 21:45:29 4/23/2016 21:49:56
## 3 2381019 148 4/15/2016 15:55:13 4/15/2016 15:57:42
## 4 10242531 425 9/30/2016 05:54:09 9/30/2016 06:01:14
## 5 2370251 450 4/15/2016 10:36:52 4/15/2016 10:44:23
## 6 540909 406 2/2/2016 09:12:29 2/2/2016 09:19:15
## start.station.id start.station.name start.station.latitude
## 1 3153 E 71 St & 2 Ave 40.76818
## 2 483 E 12 St & 3 Ave 40.73223
## 3 3223 E 55 St & 3 Ave 40.75900
## 4 529 W 42 St & 8 Ave 40.75757
## 5 305 E 58 St & 3 Ave 40.76096
## 6 491 E 24 St & Park Ave S 40.74096
## start.station.longitude end.station.id end.station.name
## 1 -73.95910 457 Broadway & W 58 St
## 2 -73.98890 504 1 Ave & E 15 St
## 3 -73.96865 456 E 53 St & Madison Ave
## 4 -73.99099 520 W 52 St & 5 Ave
## 5 -73.96724 519 Pershing Square North
## 6 -73.98602 483 E 12 St & 3 Ave
## end.station.latitude end.station.longitude bikeid usertype birth.year
## 1 40.76695 -73.98169 18198 Subscriber 1989
## 2 40.73222 -73.98166 21628 Subscriber 1996
## 3 40.75971 -73.97402 23353 Subscriber 1983
## 4 40.75992 -73.97649 16698 Subscriber 1960
## 5 40.75187 -73.97771 16941 Subscriber 1980
## 6 40.73223 -73.98890 22819 Subscriber 1986
## gender
## 1 1
## 2 1
## 3 1
## 4 1
## 5 2
## 6 1
# Great-circle (haversine) distance in metres between each trip's start
# and end stations.
citi$dist <- distHaversine(
  p1 = citi[, c("start.station.longitude", "start.station.latitude")],
  p2 = citi[, c("end.station.longitude", "end.station.latitude")]
)
# Abandoned bearing/compass exploration, kept for reference. NOTE(review):
# as written it would not run -- R's modulo operator is %%, and arr needs
# 1-based indexing (arr[0] selects nothing).
#citi$bearing <-bearingRhumb(citi[, c("start.station.longitude", "start.station.latitude") ], citi[ ,c ("end.station.longitude", "end.station.latitude") ])
#degToCompass <- function(num) {
# val <- (num/22.5)+.5
# arr <- c("N","NNE","NE","ENE","E","ESE", "SE", "SSE","S","SSW","SW","WSW","W","WNW","NW","NNW")
# return(arr[val % 16])
#}
#Time conversion
# Split the "m/d/Y H:M:S" start timestamp into a Date plus hour/minute
# columns, and derive the day of the week.
citi <- citi %>% separate(starttime, c("StartDate", "StartTime"), sep = " ")
citi$StartDate <- as.Date(citi$StartDate, format = '%m/%d/%Y')
# Only hour and minute are kept; extra = "drop" explicitly discards the
# seconds field instead of triggering a "Too many values" warning.
citi <- citi %>% separate(StartTime, c("StartHour", "StartMin"), sep = ":", extra = "drop")
citi$StartHour <- as.numeric(citi$StartHour)
citi$StartMin <- as.numeric(citi$StartMin)
citi$DayOfWeek <- weekdays(citi$StartDate)
Now we need to clean the data. Problems we are looking out for include:
Trips that last for extreme times
When dealing with ages only use subscriber data, customer data has no age information. This situation needs to be handled only when using gender as a parameter.
# Drop implausible trips: anything over 90 minutes, and sub-90-second
# "trips" that start and end at the same dock (likely false starts).
citi <- citi %>%
  filter(
    tripduration < 5400,
    !(tripduration < 90 & start.station.id == end.station.id)
  )
Let's summarize the cleaned data.
# Summary statistics (ranges, quartiles, factor counts) for the cleaned trips.
summary(citi)
## X tripduration StartDate StartHour
## Min. : 158 Min. : 62.0 Min. :2016-01-01 Min. : 0.00
## 1st Qu.: 2554261 1st Qu.: 385.0 1st Qu.:2016-04-19 1st Qu.:10.00
## Median : 5094710 Median : 634.0 Median :2016-06-19 Median :15.00
## Mean : 5121090 Mean : 810.3 Mean :2016-06-11 Mean :13.95
## 3rd Qu.: 7713365 3rd Qu.:1069.2 3rd Qu.:2016-08-14 3rd Qu.:18.00
## Max. :10262644 Max. :5398.0 Max. :2016-09-30 Max. :23.00
##
## StartMin stoptime start.station.id
## Min. : 0.00 1/21/2016 13:32:50: 2 Min. : 72.0
## 1st Qu.:15.00 1/28/2016 17:35:06: 2 1st Qu.: 331.0
## Median :29.00 7/4/2016 18:34:18 : 2 Median : 459.0
## Mean :29.51 8/15/2016 20:43:43: 2 Mean : 998.5
## 3rd Qu.:45.00 8/23/2016 08:52:28: 2 3rd Qu.: 536.2
## Max. :59.00 9/10/2016 11:45:07: 2 Max. :3431.0
## (Other) :9920
## start.station.name start.station.latitude
## Pershing Square North: 100 Min. :40.65
## E 17 St & Broadway : 81 1st Qu.:40.72
## W 21 St & 6 Ave : 79 Median :40.74
## Broadway & E 14 St : 70 Mean :40.74
## Broadway & E 22 St : 67 3rd Qu.:40.75
## 12 Ave & W 40 St : 63 Max. :40.80
## (Other) :9472
## start.station.longitude end.station.id end.station.name
## Min. :-74.02 Min. : 72.0 Pershing Square North: 109
## 1st Qu.:-74.00 1st Qu.: 329.0 E 17 St & Broadway : 82
## Median :-73.99 Median : 456.0 Broadway & E 22 St : 72
## Mean :-73.99 Mean : 965.4 West St & Chambers St: 72
## 3rd Qu.:-73.98 3rd Qu.: 530.0 W 21 St & 6 Ave : 70
## Max. :-73.93 Max. :3431.0 W 20 St & 11 Ave : 69
## (Other) :9458
## end.station.latitude end.station.longitude bikeid
## Min. :40.65 Min. :-74.02 Min. :14530
## 1st Qu.:40.72 1st Qu.:-74.00 1st Qu.:17698
## Median :40.74 Median :-73.99 Median :20840
## Mean :40.74 Mean :-73.99 Mean :20671
## 3rd Qu.:40.75 3rd Qu.:-73.98 3rd Qu.:23555
## Max. :40.80 Max. :-73.93 Max. :27307
##
## usertype birth.year gender dist
## Customer :1167 Min. :1885 Min. :0.000 Min. : 0.0
## Subscriber:8765 1st Qu.:1970 1st Qu.:1.000 1st Qu.: 884.1
## Median :1981 Median :1.000 Median : 1437.6
## Mean :1978 Mean :1.087 Mean : 1799.6
## 3rd Qu.:1987 3rd Qu.:1.000 3rd Qu.: 2326.6
## Max. :2000 Max. :2.000 Max. :10016.4
## NA's :1228
## DayOfWeek
## Length:9932
## Class :character
## Mode :character
##
##
##
##
# Distribution of riders' birth years (NA for rows without age data).
ggplot(citi, aes(x = birth.year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1228 rows containing non-finite values (stat_bin).
# Citibike users are categorized as either "Customer" or "Subscriber"
# depending on their engagement status. Plot the proportion of trips made
# by each user type.
ggplot(citi, aes(x = usertype, y = (..count..) / sum(..count..))) +
  geom_bar(fill = "blue", colour = "darkblue", width = 0.2) +
  xlab("Usertype") +
  ylab("Proportion")
#Gender distribution of users
# Gender is coded 0/1/2 (unknown/male/female) and read in as an integer
# column (see summary above). Assigning levels() to a non-factor sets an
# unused attribute and does NOT relabel the values, so the original plot
# still showed numeric codes; convert to a labelled factor instead.
citi$gender <- factor(citi$gender, levels = c(0, 1, 2),
                      labels = c("UNKNOWN", "MALE", "FEMALE"))
ggplot(citi) + geom_bar(aes(gender, y = (..count..)/sum(..count..)), fill="blue") + ylab("Proportion") +xlab("Gender") + theme_bw()
# Distribution of trip durations: integer copy in seconds plus a
# minutes column.
citi$tripdurationinteger <- as.integer(citi$tripduration)
citi <- mutate(citi, tripdurationinteger.min = tripdurationinteger / 60)
# 99th percentile of trip length in minutes (not referenced by the plot
# below in this chunk).
x.max <- quantile(citi$tripdurationinteger.min, probs = 0.99)
ggplot(citi, aes(x = tripdurationinteger)) +
  geom_histogram() +
  xlim(c(0, 5000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
# Birth-year density curves, one per reported gender.
citi %>%
  filter(gender != "0") %>%
  ggplot(.) +
  geom_density(aes(birth.year, group = gender, fill = gender, colour = gender),
               adjust = 3, alpha = 0.1) +
  xlab("Year of Birth") +
  ylab("Density")
# Rides per station per start hour, expressed as rides-per-minute,
# busiest first.
stations <- citi %>%
  group_by(start.station.name, StartHour,
           start.station.longitude, start.station.latitude) %>%
  summarise(numRides = n()) %>%
  mutate(ridesPerMinuteInHour = numRides / 60) %>%
  arrange(desc(ridesPerMinuteInHour))
# Keep the 1000 busiest station-hours. Index from 1: R is 1-based, and a
# leading 0 in the subscript is silently dropped rather than erroring.
top <- stations[1:1000, ]
# Download a Google static map of NYC (zoom 13 ~ city-centre scale). The
# "##" lines below are the download messages captured when knitting.
nyc_base <- ggmap::get_map("New York City", zoom = 13)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
# Overlay the busiest station-hours, coloured by rides per minute.
ggmap(nyc_base) + geom_point(data=top, aes(x=start.station.longitude, y=start.station.latitude, colour=ridesPerMinuteInHour), size=4, alpha=0.6)
## Warning: Removed 264 rows containing missing values (geom_point).
Here we plot the top 1,000 station-hour combinations with the busiest rides per hour throughout the day. We only see about 12 distinct locations, which means that most stations appear multiple times (at different hours). All of these are in the southern part of Manhattan.
What are the most common routes? From above, we saw that the weekdays are busiest and that this appears to come from rush-hour traffic. Let's first do some analysis on how many different routes there are.
# One row per directed start->end station pair, with trip counts and
# average distance/duration. (The original call had a trailing empty
# argument -- "end.station.longitude," followed by ")" -- now removed.)
routes <- citi %>%
  group_by(start.station.name, end.station.name,
           start.station.latitude, start.station.longitude,
           end.station.latitude, end.station.longitude) %>%
  summarise(numberOfTrips = n(),
            lengthTrip = mean(dist),
            avg_duration = mean(tripduration))
nrow(routes)
## [1] 8615
#There are 8,615 distinct routes in this sample
How many routes return to themselves?
# Exploratory check (disabled in this render): routes that return to their
# own start station, most frequent first.
#filter(routes, start.station.name == end.station.name) %>%
# arrange(desc(numberOfTrips))
Biking around central park seems like the most popular loop. Biking around Reade St is towards the southern tip of Manhattan
What is the most frequented Route?
# Order routes by popularity and keep the 500 most travelled.
routes <- arrange(routes, desc(numberOfTrips))
# 1-based indexing: the original 0:500 relied on R silently dropping the 0.
topRoutes <- routes[1:500, ]
head(topRoutes)
## # A tibble: 6 x 9
## # Groups: start.station.name, end.station.name, start.station.latitude,
## # start.station.longitude, end.station.latitude [6]
## start.station.name end.station.name
## <fctr> <fctr>
## 1 Central Park S & 6 Ave Central Park S & 6 Ave
## 2 N 6 St & Bedford Ave S 4 St & Wythe Ave
## 3 N 6 St & Bedford Ave Wythe Ave & Metropolitan Ave
## 4 Pershing Square North W 41 St & 8 Ave
## 5 Central Park S & 6 Ave 5 Ave & E 78 St
## 6 Clark St & Henry St Henry St & Atlantic Ave
## # ... with 7 more variables: start.station.latitude <dbl>,
## # start.station.longitude <dbl>, end.station.latitude <dbl>,
## # end.station.longitude <dbl>, numberOfTrips <int>, lengthTrip <dbl>,
## # avg_duration <dbl>
The most frequented route is the loop starting and ending at Central Park S &amp; 6 Ave, which suggests sightseeing rides around the park. Can we map the 500 most frequented routes?
# Earlier attempts kept for reference:
#nyc_base <- ggmap::get_map("New York City", zoom = 14)
#ggplot(data=topRoutes, aes(x=start.station.longitude, y=start.station.latitude, fill=numberOfTrips)) + geom_polygon()
# Plot the start points of the most frequented routes, coloured by count.
ggmap(nyc_base) +
  geom_point(data = topRoutes,
             aes(x = start.station.longitude, y = start.station.latitude,
                 colour = numberOfTrips),
             size = 3, alpha = 0.5)
## Warning: Removed 124 rows containing missing values (geom_point).
Using the Zillow dataset of neighborhoods in New York State, we can load the shapefile and subset it to the New York City neighborhoods.
library(rgdal, quietly = TRUE, warn.conflicts = FALSE)
## Warning: package 'rgdal' was built under R version 3.4.2
## rgdal: version: 1.2-13, (SVN revision 686)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/rgdal/proj
## Linking to sp version: 1.2-5
# Load the Zillow neighbourhood polygons for New York State (ESRI shapefile).
ny.map <- readOGR("neighborhoods-in-new-york/ZillowNeighborhoods-NY.shp", layer = "ZillowNeighborhoods-NY")
## OGR data source with driver: ESRI Shapefile
## Source: "neighborhoods-in-new-york/ZillowNeighborhoods-NY.shp", layer: "ZillowNeighborhoods-NY"
## with 579 features
## It has 5 fields
# Restrict the polygons to New York City.
neighborhoods <- ny.map[ny.map$City == "New York", ]
# NOTE(review): levels() still reflects the full state-wide factor, which is
# why non-NYC names (e.g. "19th Ward", "Airmont") print below; droplevels()
# would restrict them -- confirm intent before changing the printed output.
neighborhood_names <- levels(neighborhoods$Name)
print(head(neighborhood_names, n = 12))
## [1] "19th Ward" "Abbott McKinley" "Airmont"
## [4] "Albright" "Alcove" "Allen"
## [7] "Allentown" "Altamont" "Annadale"
## [10] "Aquebogue" "Arbor Hill" "Arden Heights"
# Assign each point (the named long/lat columns of `df`) to a Zillow
# neighbourhood polygon via a spatial join, writing the polygon name into a
# new column `neighborhood_feature`. Relies on the global `neighborhoods`
# SpatialPolygonsDataFrame loaded above.
#
# df                   - data frame containing the coordinate columns
# long_feature         - name (string) of the longitude column
# lat_feature          - name (string) of the latitude column
# neighborhood_feature - name (string) of the output column
# Returns `df` with the new factor column added.
find_neighborhoods <- function(df, long_feature, lat_feature, neighborhood_feature) {
dat <- data.frame(long = df[long_feature][[1]], lat = df[lat_feature][[1]])
# Promote to a SpatialPoints object in the same CRS as the polygons.
coordinates(dat) <- ~ long + lat
proj4string(dat) <- proj4string(neighborhoods)
# Point-in-polygon lookup; points outside every polygon come back NA.
df[neighborhood_feature] <- over(dat, neighborhoods)$Name
# Add an explicit "Unknown" level and use it for unmatched points.
levels(df[[neighborhood_feature]]) <- c(levels(df[[neighborhood_feature]]), "Unknown")
df[[neighborhood_feature]][is.na(df[[neighborhood_feature]])] <- "Unknown"
return(df)
}
# Tag each trip with its pickup and dropoff neighbourhoods.
citi <- find_neighborhoods(citi, "start.station.longitude", "start.station.latitude", "pickup_neighborhood")
# BUG FIX: the latitude argument previously repeated "end.station.longitude",
# so dropoff neighbourhoods were looked up at nonsense coordinates.
citi <- find_neighborhoods(citi, "end.station.longitude", "end.station.latitude", "dropoff_neighborhood")
#citi = find_neighborhoods(citi, "pickup_longitude", "pickup_latitude", "pickup_neighborhood")
#citi = find_neighborhoods(citi, "dropoff_longitude", "dropoff_latitude", "dropoff_neighborhood")
Now that we have all of the neighborhood information, let's see the most popular pickup neighborhoods. It appears that the most popular neighborhoods are in the lower part of Manhattan. These must be more bikeable areas.
# The 16 most common pickup neighbourhoods, by trip count.
sort(table(citi$pickup_neighborhood), decreasing = TRUE)[1:16]
##
## Chelsea Flatiron District Greenwich Village
## 736 646 594
## Midtown Lower East Side East Village
## 589 544 525
## Garment District Upper East Side Gramercy
## 475 454 438
## Clinton Upper West Side Williamsburg
## 428 399 384
## West Village Tribeca Financial District
## 381 336 309
## Murray Hill
## 252
# NOTE(review): zoom = 1 fetches a whole-world map, and this `nyc_base` is
# not used by any live code below (only a commented-out line) -- likely a
# leftover/typo for the zoom = 13 used earlier; confirm before removing.
nyc_base <- ggmap::get_map("New York City", zoom = 1)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=1&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
# Re-rank station-hours by raw ride counts and keep the 2000 busiest.
stations <- citi %>%
  group_by(start.station.name, StartHour,
           start.station.longitude, start.station.latitude) %>%
  summarise(numRides = n()) %>%
  arrange(desc(numRides))
# 1-based indexing: the original 0:2000 relied on R silently dropping the 0.
top <- stations[1:2000, ]
#top
# Fetch a map centred on Herald Square (zoom 12) for the density overlays.
nyc <- get_map("Herald Square, new york", zoom=12)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=Herald+Square,+new+york&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Herald%20Square,%20new%20york&sensor=false
nyc.map <- ggmap(nyc)
# 2-D kernel density of trip START locations.
# NOTE(review): `startStation` is built but never printed in this chunk, so
# only the end-station plot below is rendered -- confirm whether the start
# plot was meant to be shown too.
startStation <- nyc.map + stat_density2d(aes(x= start.station.longitude , y= start.station.latitude, fill = ..level..), alpha=0.6,
size = 2, bins = 8, data=citi, geom="polygon") + scale_fill_gradient(low="pink", high="purple4")
#ggmap(nyc_base) + geom_point(data=top, aes(x=start.station.longitude, y=start.station.latitude, colour=numRides), size=4, alpha=0.3)
# 2-D kernel density of trip END locations.
nyc.map + stat_density2d(aes(x= end.station.longitude , y= end.station.latitude, fill = ..level..), alpha=0.6,
size = 2, bins = 8, data=citi, geom="polygon") + scale_fill_gradient(low="pink", high="purple4")
## Warning: Removed 5 rows containing non-finite values (stat_density2d).
# Median trip duration from each start station, drawn on the map with a
# red-white-blue diverging scale centred at 500 seconds.
trip_duration_by_station <- citi %>%
  group_by(start.station.name, start.station.longitude, start.station.latitude) %>%
  summarise(median_trip = median(tripduration))
ggmap(nyc) +
  geom_point(data = trip_duration_by_station,
             aes(x = start.station.longitude, y = start.station.latitude,
                 colour = median_trip),
             size = 3, alpha = 0.8) +
  scale_color_gradient2(low = "red", mid = "white", high = "blue",
                        midpoint = 500)
## Warning: Removed 5 rows containing missing values (geom_point).
Lets compare the different habits of female vs male ridership throughout the city.
# Share of female riders per start station. Gender is coded 0/1/2
# (unknown/male/female) in the raw data; the original comparisons against
# "Unspecified" and "Female" never matched anything, so pct.female was 0
# everywhere. %in% matching below works whether gender is still the raw
# integer codes or the relabelled UNKNOWN/MALE/FEMALE factor.
stationByGender <- citi %>%
  filter(!gender %in% c(0, "UNKNOWN")) %>%
  group_by(start.station.name, start.station.latitude, start.station.longitude) %>%
  summarise(pct.female = mean(gender %in% c(2, "FEMALE")))
# Single point layer (the original added a second, data-less geom_point);
# diverging scale centred at 0.5 so majority-female stations stand out.
ggmap(nyc) +
  geom_point(data = stationByGender,
             aes(x = start.station.longitude, y = start.station.latitude,
                 color = pct.female),
             size = 6, alpha = 1) +
  scale_color_gradient2(midpoint = 0.5)
## Warning: Removed 5 rows containing missing values (geom_point).